###set your working directory to the source file localtion###
d.HLL<-read.csv("../data/HLL-TEST-2018_08.44_numeric_values.csv")
header <- names(d.HLL)
d.HLL<- read.csv("../data/HLL-TEST-2018_08.44_numeric_values.csv", col.names = header, skip = 2)
item.code.keys <- read.csv("../data/itemcodekeyHLL.csv")#add conditions
todrop <- c("StartDate","EndDate","Finished","RecordedDate","RecipientLastName","RecipientFirstName", "RecipientEmail","ExternalReference" ,"LocationLatitude","LocationLongitude" ,"DistributionChannel", "IPAddress","UserLanguage")
d.HLL%>%
select(-one_of(todrop)) %>%
rename(ExpLists = FL_4_DO, SubGender = Q1.6, SubAge =Q1.7, L1= Q1.8 , Education=Q1.9,Dialects= Q1.15_1, SubID=ResponseId , L2=Q1.16_1 )%>%
filter(Status == 0 & Progress==100 ) %>% #real participants+finished
select(-Status,-Progress,-Q1.3,-Q1.4,- Q1.17,-starts_with("Q2."),-starts_with("Q8.")) %>%
gather(key = Qnumber,value=choice,Q3.1:Q7.161,na.rm = T) %>%#widd to long
arrange(SubID, Qnumber) %>%
group_by(SubID) %>% #add scores
mutate(zscore = scale(choice)) %>%
ungroup()%>%
mutate_at(vars(matches("_DO")), as.character)%>%
mutate_at(vars(matches("ExpLists")), as.character)%>%
mutate(TrialOrder = case_when(ExpLists == "list-pair-1" ~ list.pair.1_DO,
ExpLists == "list-pair-2" ~ list.pair.2_DO,
ExpLists == "list-filler-a" ~ list.filler.a_DO,
ExpLists == "list-filler-b" ~ list.filler.b_DO,
ExpLists == "list-filler-c" ~ list.filler.c_DO))%>%
select(-matches("_DO"))%>%
inner_join(y=item.code.keys)->d.HLL.cond #add experimental conditions
#add trial order sequence
seqnum=as.character(c(1:158))
d.HLL.cond %>%
select(SubID,TrialOrder) %>%
distinct()%>%
separate(TrialOrder, into=seqnum, sep = "\\|")%>%
gather(key = QSEQ,value=Qnumber,"1":"158",na.rm = T) %>%#widd to long
inner_join(y=d.HLL.cond)->d.HLL.cond## # A tibble: 5 x 2
## ExpLists count
## <chr> <int>
## 1 list-filler-a 16
## 2 list-filler-b 16
## 3 list-filler-c 18
## 4 list-pair-1 47
## 5 list-pair-2 51
#subjects age infor
d.HLL.cond %>%
group_by(SubGender) %>%
summarise(count = n_distinct(SubID),age_mean = mean(as.numeric(as.character(SubAge))),age_SD = sd(as.numeric(as.character(SubAge))))## # A tibble: 2 x 4
## SubGender count age_mean age_SD
## <int> <int> <dbl> <dbl>
## 1 1 38 24.9 4.81
## 2 2 110 23.5 5.76
#subjects who have "linguistics" related background
d.HLL.cond %>%
filter(grepl("语言",Q1.12)==T|grepl("语言",Q1.13)==T) %>%
summarise(count = n_distinct(SubID))## # A tibble: 1 x 1
## count
## <int>
## 1 7
#check "catch" trials
#exclude subjects failed any catch-question
d.HLL.cond %>%
filter(Category=="catch") %>%
select(SubID,choice,SenType,ExpLists) %>%
mutate(CorrectA = ifelse(SenType == "catch2", 2, 5)) %>%
filter(choice!=CorrectA) %>%
select(SubID)%>%
unique()%>%
droplevels()%>%
.$SubID->sub.exclude
#subjects exclude in each group
d.HLL.cond %>%
filter(Category=="catch") %>%
select(SubID,choice,SenType,ExpLists) %>%
mutate(CorrectA = ifelse(SenType == "catch2", 2, 5)) %>%
filter(choice!=CorrectA) %>%
group_by(ExpLists) %>%
summarise(count = n_distinct(SubID))## # A tibble: 5 x 2
## ExpLists count
## <chr> <int>
## 1 list-filler-a 2
## 2 list-filler-b 1
## 3 list-filler-c 5
## 4 list-pair-1 7
## 5 list-pair-2 3
d.HLL.cond %>%
filter(!SubID%in%sub.exclude) %>%
filter(Category!="catch") %>%
mutate(SenCon=ifelse(SenType=="g","good",
ifelse(SenType=="q","questionable","bad")))->d.HLL.all
d.HLL.all %>%
group_by(SenCon) %>%
summarise(MeanScore=mean(choice))## # A tibble: 3 x 2
## SenCon MeanScore
## <chr> <dbl>
## 1 bad 3.10
## 2 good 5.19
## 3 questionable 4.22
#theme for plots
theme.yuhang <- theme_bw()+
theme(axis.text=element_text(size=12),axis.title=element_text(size=14,face="bold"))+
theme(legend.text = element_text(size=12))+
theme(legend.title = element_text(size=14, face="bold"))+
theme(legend.title.align = 0.5)+
theme(strip.text = element_text(size=12))+
theme(title = element_text(size=14, face="bold"))+
theme(plot.title = element_text(hjust = 0.5))
theme_set(theme.yuhang)ggplot(d.HLL.all, aes(x = SenCon, y=as.numeric(choice)))+
stat_summary(geom = "bar",fun.y = "mean", aes(fill=SenCon),colour="black")+
stat_summary(geom = "errorbar", fun.data = "mean_cl_boot",width = 0.1)+
scale_x_discrete(limits=c("bad","questionable","good"))+
scale_fill_manual(limits=c("bad","questionable","good"),values=c("#F8766D", "#00BA38", "#00BFC4"))+
coord_cartesian(ylim =c(0,7))+
scale_y_continuous(breaks=seq(0,7,1))+
labs(x = "Sentence Type", y="Acceptability Rating")+
guides(colour=FALSE,fill=FALSE)->p.overall.ex1
p.overall.ex1d.HLL.all %>%
mutate(contrast_BvQ =ifelse(SenCon == "bad",1,0))%>%
mutate(contrast_GvQ =ifelse(SenCon == "good",1,0))->d.HLL.all
#use question as the baseline level
m.alldata<-lmer(zscore~contrast_BvQ+contrast_GvQ+(1+contrast_BvQ+contrast_GvQ |SubID)+(1+contrast_BvQ+contrast_GvQ|ItemNum),data=d.HLL.all)
summary(m.alldata)$coefficients## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.05217386 0.1123596 32.57582 -0.4643472 0.6454875138
## contrast_BvQ -0.43584156 0.1182534 33.77810 -3.6856569 0.0007943506
## contrast_GvQ 0.54985892 0.1138118 30.74540 4.8312987 0.0000354678
# m.alldata.raw<-lmer(choice~contrast_BvQ+contrast_GvQ+(1+contrast_BvQ+contrast_GvQ|SubID)+(1+contrast_BvQ+contrast_GvQ|ItemNum),data=d.HLL.all)
# summary(m.alldata.raw)$coefficients
# use bad as the baseline level
#summary(lmer(choice~SenCon+(1+SenCon|SubID)+(1+SenCon|ItemNum),data=d.HLL.all))
# m.allbadasbaseline<-lmer(choice~SenCon+(1+SenCon|SubID)+(1+SenCon|ItemNum),data=d.HLL.all)
# summary(m.allbadasbaseline)
m.allbadasbaseline.z<-lmer(zscore~SenCon+(1+SenCon|SubID)+(1+SenCon|ItemNum),data=d.HLL.all)
summary(m.allbadasbaseline.z)$coefficients## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.4880032 0.03311473 285.09820 -14.736741 6.241347e-37
## SenCongood 0.9857159 0.04506640 280.49397 21.872522 1.423926e-62
## SenConquestionable 0.4342545 0.11818630 33.66681 3.674322 8.225847e-04
#we will look at "other" lists only!
# I also tried all data
d.HLL.all %>%
filter(Category!="pair")%>%
group_by(ItemNum, SenCon) %>%
summarise(Avescore=mean(choice)) %>%
ggplot(aes(x = ItemNum, y=Avescore,label=ItemNum))+
geom_text(aes(colour=SenCon))+
labs(x = "Contrast Number", y="Average Acceptability Rating")+
scale_color_manual(name="Sentence Type",limits=c("bad","questionable","good"),values=c("#F8766D", "#00BA38", "#00BFC4"))+
geom_hline(yintercept = 4,colour="grey",linetype = 2,size = 0.3)+
theme_bw()+
theme(axis.text.x = element_blank(),axis.ticks.x = element_blank(),line = element_blank())#we will look at the "pair" first
d.HLL.cond %>%
filter(!SubID%in%sub.exclude) %>%
filter(Category!="catch") %>%
mutate(SenCon=ifelse(SenType=="g","good",
ifelse(SenType=="q","questionable","bad"))) %>%
filter(ExpLists %in% c("list-pair-1","list-pair-2")) -> d.HLL.pair #plot for all pairs
ggplot(d.HLL.pair, aes(x = SenCon, y=as.numeric(choice)))+
geom_jitter(alpha = 0.1, aes(colour=SenCon))+
stat_summary(geom = "line", fun.y = "mean", aes(group=ItemNum),alpha=0.2) +
stat_summary(geom = "point",fun.y = "mean", aes(colour=SenCon),size = 6)+
stat_summary(geom = "errorbar", fun.data = "mean_cl_boot",width = 0.1)+
scale_y_continuous(breaks=seq(0,7,1))+
labs(x = "Sentence Type", y="Acceptability Rating")+
geom_hline(yintercept = 4,colour="blue",linetype = 2)+
guides(colour=FALSE,fill=F)->p.pair.all
p.pair.all#by-item plot
ggplot(d.HLL.pair,aes(x = SenCon, y=as.numeric(choice)))+
stat_summary(geom = "point",fun.y = "mean", aes(color=SenCon))+
stat_summary(geom = "errorbar", fun.data = "mean_cl_boot",width = 0.1)+
geom_hline(yintercept = 4,colour="blue",linetype = 2)+
facet_wrap(~ItemNum)+
scale_y_continuous(breaks=seq(-1,8,2))+
labs(x = "Sentence Type", y="Acceptability Rating")+
guides(colour=FALSE) +
theme_bw()->p.pair.byitem
#p.pair.byitem see plots folder for item-by-item plot#Z-score version plot for all pairs
ggplot(d.HLL.pair, aes(x = SenCon, y=as.numeric(zscore)))+
geom_jitter(alpha = 0.05, aes(colour=SenCon))+
stat_summary(geom = "line", fun.y = "mean", aes(group=ItemNum),alpha=0.2) +
stat_summary(geom = "point",fun.y = "mean", aes(color=SenCon),size=6)+
stat_summary(geom = "errorbar", fun.data = "mean_cl_boot",width = 0.1)+
labs(x = "Sentence Type", y="Acceptability Rating")+
geom_hline(yintercept = 0,colour="blue",linetype = 2)+
guides(colour=FALSE)->p.pair.all.z
p.pair.all.z#z-score by-item plot
ggplot(d.HLL.pair,aes(x = SenCon, y=as.numeric(zscore)))+
# geom_violin(alpha = 0.5,aes(colour=SenCon))+
stat_summary(geom = "point",fun.y = "mean", aes(color=SenCon),size=2)+
stat_summary(geom = "errorbar", fun.data = "mean_cl_boot",width = 0.1)+
geom_hline(yintercept = 0,colour="blue",linetype = 2)+
facet_wrap(~ItemNum)+
scale_y_continuous(breaks=seq(-3,3,1))+
labs(x = "Sentence Type", y="Acceptability Rating")+
guides(colour=FALSE) +
theme_bw()->p.pair.byitem.z
#p.pair.byitem.z See plots folder for item-by-item plot# difference scores between good and bad
d.HLL.pair %>%
group_by(ItemNum, SenCon) %>%
summarise(Avescore=mean(choice)) %>%
mutate(DiffScore = Avescore-lag(Avescore)) %>%
filter(!is.na(DiffScore)) %>%
ggplot(aes(x = ItemNum, y=DiffScore,label = ItemNum))+
geom_text(size=3)+
labs(x = "Contrast Number", y="Acceptability Rating Difference")+
geom_hline(yintercept = 0,colour="blue",linetype = 2)+
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
line = element_blank())->p.pair.differscore
p.pair.differscoreggplot(d.HLL.pair,aes(x = as.numeric(QSEQ) , y=as.numeric(choice),colour=SenCon))+
stat_summary(geom = "point",fun.y = "mean")+
stat_summary(geom = "errorbar", fun.data = "mean_cl_boot",aes(colour=SenCon),width = 0.1,alpha=0.5)+
geom_smooth(method = "lm")+
geom_hline(yintercept = 4,colour="blue",linetype = 2)+
labs(x = "Trial Sequence", y="Mean Acceptability Rating")+
guides(colour=guide_legend(title="Sentence Type")) ->p.pair.sequence
p.pair.sequence#coding variables
d.HLL.pair$SenTypeC<-ifelse(d.HLL.pair$SenCon=="good", 1,-1)
d.HLL.pair$QSEQ<-as.numeric(d.HLL.pair$QSEQ)
d.HLL.pair$SubAge<-as.numeric(d.HLL.pair$SubAge)
#check control variables
m.control<-lmer(choice~QSEQ+SubGender+SubAge+Education+(1|SubID)+(1|ItemNum),data=d.HLL.pair )
summary(m.control)$coefficients## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 4.4601097378 0.5302469412 88.43602 8.4113823 6.452441e-13
## QSEQ -0.0003828485 0.0003483153 13494.62636 -1.0991436 2.717251e-01
## SubGender -0.2343996554 0.1577978803 84.00309 -1.4854424 1.411700e-01
## SubAge 0.0330117856 0.0206249590 84.00373 1.6005746 1.132255e-01
## Education -0.0517628712 0.1191219145 84.00278 -0.4345369 6.650128e-01
m.control.z<-lmer(zscore~QSEQ+SubGender+SubAge+Education+(1|SubID)+(1|ItemNum),data=d.HLL.pair )
summary(m.control.z)$coefficients## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.0209248030 0.0731361725 1800.817 0.28610744 0.7748287
## QSEQ -0.0001371227 0.0001642316 13578.230 -0.83493484 0.4037691
## SubGender -0.0052716814 0.0183615016 13567.129 -0.28710514 0.7740362
## SubAge 0.0004133445 0.0024000092 13567.179 0.17222622 0.8632623
## Education -0.0004701398 0.0138609299 13567.144 -0.03391835 0.9729428
#all pairs raw score models
m.all<-lmer(choice~SenTypeC+(1+SenTypeC|SubID)+(1+SenTypeC|ItemNum),data=d.HLL.pair)
summary(m.all)$coefficients## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 4.103809 0.10295624 234.6610 39.85974 1.854976e-106
## SenTypeC 1.084347 0.06223386 223.4381 17.42375 1.623510e-43
#all pairs z score models
m.all.z<-lmer(zscore~SenTypeC+(1+SenTypeC|SubID)+(1+SenTypeC|ItemNum),data=d.HLL.pair)
summary(m.all.z)$coefficients## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.002648455 0.03877292 156.9569 0.06830681 9.456283e-01
## SenTypeC 0.515040713 0.02659701 186.8541 19.36460814 1.526526e-46
#item by item analysis
itemlist<-unique(d.HLL.pair$ItemNum)
#run models for each item and print t values for non-sig items
nonsigitem <- c()
for (i in itemlist){
m.a<-summary(lm(choice~SenTypeC,data=subset(d.HLL.pair, ItemNum==i)))$coef[, "t value"]
if (getElement(m.a, "SenTypeC") < 2){print(paste(i," ",formatC(getElement(m.a, "SenTypeC"),digits=2,format="f")))
nonsigitem[i]<-i}
}## [1] "c4-27 1.09"
## [1] "c3-f9 1.77"
## [1] "c5-18 1.32"
## [1] "c4-56b 1.90"
## [1] "c5-43 1.23"
## [1] "c2-34 1.23"
## [1] "c8-93c -0.29"
## [1] "c6-98 1.30"
## [1] "c7-28 -1.87"
## [1] "c1-50a -2.09"
## [1] "c6-99 0.67"
## [1] "c6-80a 0.51"
## [1] "c6-83 -0.19"
## [1] "c4-39 1.28"
## [1] "c4-32 1.22"
## [1] "c8-60 -1.70"
## [1] "c8-77 0.48"
## [1] "c8-93a 1.78"
## [1] "c5-f9 1.89"
## [1] "c8-08 0.80"
## [1] "c8-71b -0.77"
## [1] "c7-102 -1.08"
## [1] "c1-28 1.21"
## [1] "c6-24 0.32"
## [1] "c8-40a 0.56"
## [1] "c3-54 -0.06"
## [1] "26 non-sig/reversed in raw data"
#how about z-scores?
nonsigitem.z <- c()
for (i in itemlist){
m.z<-summary(lm(zscore~SenTypeC,data=subset(d.HLL.pair, ItemNum==i)))$coef[, "t value"]
if (getElement(m.z, "SenTypeC") < 2){print(paste(i," ",formatC(getElement(m.z, "SenTypeC"),digits=2,format="f")))
nonsigitem.z[i]<-i}
}## [1] "c4-27 -0.18"
## [1] "c4-56b 1.16"
## [1] "c2-34 1.94"
## [1] "c8-93c 0.19"
## [1] "c6-98 0.47"
## [1] "c7-28 -2.82"
## [1] "c1-50a -3.67"
## [1] "c6-99 1.98"
## [1] "c6-80a 1.20"
## [1] "c6-83 -1.07"
## [1] "c8-60 -1.13"
## [1] "c8-77 0.94"
## [1] "c8-08 -0.61"
## [1] "c8-71b -0.02"
## [1] "c7-102 0.29"
## [1] "c6-24 1.22"
## [1] "c3-54 0.77"
## [1] "17 non-sig/reversed in z-transformed data"
nonsig<-unname(nonsigitem)
nonsig.z<-unname(nonsigitem.z)
print("non-sig/reversed in both raw and z-transformed data")## [1] "non-sig/reversed in both raw and z-transformed data"
## [1] "c4-27" "c4-56b" "c2-34" "c8-93c" "c6-98" "c7-28" "c1-50a"
## [8] "c6-99" "c6-80a" "c6-83" "c8-60" "c8-77" "c8-08" "c8-71b"
## [15] "c7-102" "c6-24" "c3-54"
## [1] "non-sig/reversed in raw data only"
## [1] "c3-f9" "c5-18" "c5-43" "c4-39" "c4-32" "c8-93a" "c5-f9" "c1-28"
## [9] "c8-40a"
nonsig<-unname(nonsigitem)
#by-item plot for nonsig
ggplot(subset(d.HLL.pair,ItemNum %in% nonsig ),aes(x = SenCon, y=as.numeric(choice)))+
geom_jitter(alpha = 0.15, aes(colour=SenCon))+
stat_summary(geom = "point",fun.y = "mean", aes(color=SenCon),size=2)+
stat_summary(geom = "errorbar", fun.data = "mean_cl_boot",colour="black",width = 0.2)+
geom_hline(yintercept = 4,colour="blue",linetype = 2)+
geom_rect(data = subset(d.HLL.pair,ItemNum %in%bothnonsig),
xmin = -Inf,xmax = Inf,ymin = -Inf,ymax = Inf,
colour="red",size=1.5,alpha = 0) +
facet_wrap(~ItemNum)+
scale_y_continuous(breaks=seq(-1,7,2))+
labs(x = "Sentence Type", y="Acceptability Rating",title="Non-significant/reversed contrasts")+
guides(colour=FALSE) +
theme_bw()->p.nonsig.byitem
p.nonsig.byitem#show items that non-sig in both rawdata and z-data
ggplot(subset(d.HLL.pair,ItemNum %in% nonsig.z),aes(x = SenCon, y=as.numeric(choice)))+
geom_jitter(alpha = 0.15, aes(colour=SenCon))+
stat_summary(geom = "point",fun.y = "mean", aes(color=SenCon),size=1.5)+
stat_summary(geom = "errorbar", fun.data = "mean_cl_boot",colour="black",width = 0.25)+
geom_hline(yintercept = 4,colour="blue",linetype = 2)+
facet_wrap(~ItemNum)+
scale_y_continuous(breaks=seq(-1,7,2))+
labs(x = "Sentence Type", y="Acceptability Rating")+
guides(colour=FALSE)->p.nonsig.inboth.byitem
p.nonsig.inboth.byitem#show items that non-sig in only raw data
ggplot(subset(d.HLL.pair,ItemNum %in% onlyraw),aes(x = SenCon, y=as.numeric(choice)))+
geom_jitter(alpha = 0.15, aes(colour=SenCon))+
stat_summary(geom = "point",fun.y = "mean", aes(color=SenCon),size=1.5)+
stat_summary(geom = "errorbar", fun.data = "mean_cl_boot",colour="black",width = 0.25)+
geom_hline(yintercept = 4,colour="blue",linetype = 2)+
facet_wrap(~ItemNum)+
scale_y_continuous(breaks=seq(-1,7,2))+
labs(x = "Sentence Type", y="Acceptability Rating")+
guides(colour=FALSE)->p.nonsig.onlyraw.byitem
#p.nonsig.onlyraw.byitem#by-item plot for nonsig~zcores&raw
ggplot(subset(d.HLL.pair,ItemNum %in% nonsig.z ),aes(x = SenCon, y=as.numeric(zscore)))+
geom_jitter(alpha = 0.2, aes(colour=SenCon))+
stat_summary(geom = "point",fun.y = "mean", aes(color=SenCon),size=2)+
stat_summary(geom = "errorbar", fun.data = "mean_cl_boot",aes(colour=SenCon),width = 0.2)+
geom_hline(yintercept = 0,colour="blue",linetype = 2)+
facet_wrap(~ItemNum)+
labs(x = "Sentence Type", y="Acceptability rating (z-transformed)",title="Non-significant/reversed contrasts")+
guides(colour=FALSE) +
theme_bw()->p.nonsig.byitem.z
p.nonsig.byitem.zSee “Appendix” of the paper for details
## [1] "c4-27" "c3-f9" "c5-18" "c4-56b" "c5-43" "c2-34" "c8-93c"
## [8] "c6-98" "c7-28" "c1-50a" "c6-99" "c6-80a" "c6-83" "c4-39"
## [15] "c4-32" "c8-60" "c8-77" "c8-93a" "c5-f9" "c8-08" "c8-71b"
## [22] "c7-102" "c1-28" "c6-24" "c8-40a" "c3-54"
# getting the control pairs
itemlist<-unique(d.HLL.pair$ItemNum)
itemtvalue <- c()
itemname <- c()
x<-1
for (i in itemlist){
m.a<-summary(lm(choice~SenTypeC,data=subset(d.HLL.pair, ItemNum==i)))$coef[, "t value"]
itemtvalue[x]<-getElement(m.a, "SenTypeC")
itemname[x]<-i
x<-x+1
}
d.itemt <- data.frame(itemnames=itemname,tvalue=itemtvalue)
d.itemt %>%
arrange(-tvalue) %>%
top_n(29) %>% ## used c5-69 and c8-71c as expt instruction examples
select(itemnames)%>%
unique() %>%
.$itemnames->controlgroup
remove <- c ("c5-69", "c8-71c")
controlgroup.p<-controlgroup[! controlgroup %in% remove]ggplot(subset(d.HLL.pair,ItemNum %in% controlgroup.p ),aes(x = SenCon, y=as.numeric(choice)))+
geom_jitter(alpha = 0.05, aes(colour=SenCon))+
stat_summary(geom = "point",fun.y = "mean", aes(color=SenCon),size=2)+
stat_summary(geom = "errorbar", fun.data = "mean_cl_boot",colour="black",width = 0.2)+
geom_hline(yintercept = 4,colour="blue",linetype = 2)+
facet_wrap(~ItemNum)+
scale_y_continuous(breaks=seq(-1,7,2))+
labs(x = "Sentence Type", y="Acceptability Rating",title="Forced choice control 1")+
guides(colour=FALSE) +
theme_bw()->p.control.byitem
p.control.byitemd.fc<-read.csv("../data/HLL-ForcedChoice-20180922.csv")
d.fc<-tail(d.fc,-2)
item.code.FC<-read.csv("../data/item.code.FC.csv")#now we check catch trials
d.fc%>%
filter(Status == 0 & Progress==100 ) %>% #non-testing+finished
select(ResponseId,starts_with("Q3.55"),starts_with("Q3.56"))%>%
mutate(Correct55 = ifelse(Q3.55_DO == "2|1", 2, 1)) %>%
mutate(Correct56 = ifelse(Q3.56_DO == "2|1", 1, 2)) %>%
# filter(Q3.55!=Correct55 & Q3.56!=Correct56)%>% ## failed both catch q 7 subjects
filter(Q3.55!=Correct55 | Q3.56!=Correct56)%>% ## failed any catch q (22 subjects)
select(ResponseId)%>%
unique()%>%
droplevels()%>%
.$ResponseId->subtoexclude
print(paste0(n_distinct(subtoexclude), " subjects excluded"))## [1] "22 subjects excluded"
d.fc%>%
select(-one_of(todrop))%>%
rename(SubGender = Q1.6, SubAge =Q1.7, L1= Q1.8 ,
Education=Q1.9,Dialects= Q1.15_1, SubID=ResponseId ,
L2=Q1.16_1,ExpOrder=Experiment_DO)%>%
filter(Status == 0 & Progress==100 ) %>% #non-testing+finished
select(-Status,-Progress,-Q1.3,-Q1.4,- Q1.17,-starts_with("Q2."),-starts_with("Q4."),-FL_4_DO) %>%
filter(!SubID%in%subtoexclude) %>%
select(-ends_with("_DO"),-Q3.55,-Q3.56)%>%
gather(key = Qnumber,value=choice,Q3.1:Q3.54)%>%
inner_join(y=item.code.FC)%>%
mutate(GroupNew=ifelse(Group=="test"&ItemNum%in%nonsig.z,"test",ifelse(Group=="control"&!ItemNum%in%nonsigitem,"control 1", "control 2")))->d.fc.ready#3 groups now control 1 control2() and test #Participants# mean age
d.fc.ready%>%
group_by(SubGender) %>%
summarise(count = n_distinct(SubID),age_mean = mean(as.numeric(as.character(SubAge))),age_SD = sd(as.numeric(as.character(SubAge))))## # A tibble: 2 x 4
## SubGender count age_mean age_SD
## <fct> <int> <dbl> <dbl>
## 1 1 15 25 2.25
## 2 2 49 23.6 1.90
#Participants who have linguistics related background
d.fc.ready %>%
filter(grepl("语言",Q1.12)==T|grepl("语言",Q1.13)==T) %>%
summarise(count = n_distinct(SubID))## count
## 1 8
#visualiztion: general pattern
d.fc.ready %>%
mutate(answer=ifelse(choice==GoodChoice,"Good","Bad"))%>%
group_by(GroupNew,ItemNum,answer) %>%
tally()%>%
mutate(Proportion = n / sum(n),se = sqrt(Proportion*(1-Proportion)/n))%>%
ggplot(aes(x=GroupNew,y=Proportion,fill=answer))+
stat_summary(geom = "bar", fun.y = "mean", aes(fill=answer),color="black",position = position_dodge())+
stat_summary(geom = "errorbar", fun.data = "mean_cl_boot",position = position_dodge(width = 0.9),width=0.3)+
scale_y_continuous(labels=percent_format())+
labs(y="Proportion of Choice",x="Group")->p.overall.ex2
p.overall.ex2#Control group 1: Individual-item
d.fc.ready %>%
mutate(answer=ifelse(choice==GoodChoice,"Good","Bad"))%>%
group_by(Group,ItemNum,answer) %>%
tally()%>%
mutate(Proportion = n / sum(n),se = sqrt(Proportion*(1-Proportion)/n))%>%
filter(Group=="control")%>%
ggplot(aes(x=answer,y=Proportion,fill=answer))+
stat_summary(geom = "bar", fun.y = "mean", aes(fill=answer),color="black",position = position_dodge())+
scale_y_continuous(labels=percent_format())+
facet_wrap(~ItemNum,ncol = 7)+
labs(y="Proportion of Choice",title="Control group 1: Individual-item")+
scale_fill_discrete(name="Answer")+
theme_bw(base_size = 9)+
theme(legend.position = c(0.95, 0.09))d.fc.ready %>%
mutate(answer=ifelse(choice==GoodChoice,"Good","Bad"))%>%
group_by(GroupNew,ItemNum,answer) %>%
tally()%>%
mutate(Proportion = n / sum(n),se = sqrt(Proportion*(1-Proportion)/n))%>%
filter(GroupNew=="test")->d.fc.testgroup
ggplot(data=d.fc.testgroup,aes(x=answer,y=Proportion))+
stat_summary(geom = "bar", fun.y = "mean", aes(fill=answer),color="black",position = position_dodge())+
geom_hline(yintercept = 0.5,colour="blue",linetype = 2)+
scale_y_continuous(labels=percent_format())+
facet_wrap(~ItemNum,ncol = 6)+
labs(y="Proportion of Choice",title="Test group: Individual-item")+
scale_fill_discrete(name="Answer")+
theme_bw(base_size = 9)+
theme(legend.position = c(0.95, 0.09))#Comparing the "good" vs. "bad" choices. Does the "good" sigifictly higher than the "bad"(or 50%)?
#overall: all items
#control group
d.fc.ready %>%
group_by(SubID,GroupNew) %>%
tally() %>%
inner_join(d.fc.ready)%>%
mutate(answerC=ifelse(choice==GoodChoice,1,0)) %>%
mutate(emplogit=log((answerC+0.5)/(n-answerC+0.5)))->d.fc.ready.model#Three groups: control 1 2 and test
summary(glmer(answerC~1+(1|SubID)+(1|ItemNum),subset(d.fc.ready.model,GroupNew=="control 1"),family = binomial(link = "logit")))$coefficient## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.798358 1.162663 5.847229 4.998286e-09
summary(glmer(answerC~1+(1|SubID)+(1|ItemNum),subset(d.fc.ready.model,GroupNew=="control 2"),family = binomial(link = "logit")))$coefficient## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.747137 0.2617029 6.676031 2.455007e-11
summary(glmer(answerC~1+(1|SubID)+(1|ItemNum),subset(d.fc.ready.model,GroupNew=="test"),family = binomial(link = "logit")))$coefficient## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.8110841 0.3478099 2.331975 0.019702
#item-by-item
#control group 1
itemlist.fc<-unique(subset(d.fc.ready.model,GroupNew=="control 1")$ItemNum)
#run models for each item and print z values
#controllist.fc <- c()
for (i in itemlist.fc){
m.fc<-summary(glm(answerC~1,data=subset(d.fc.ready.model,ItemNum==i),family = binomial(link = "logit")))$coef[, "z value"]
print(paste(i," ",formatC(m.fc,digits=2,format="f")))
}## [1] "c1-43 0.00"
## [1] "c2-01b 0.00"
## [1] "c2-40 0.00"
## [1] "c3-31 4.11"
## [1] "c3-57 0.00"
## [1] "c3-55 0.00"
## [1] "c3-56 0.00"
## [1] "c3-58 0.00"
## [1] "c5-09 4.11"
## [1] "c5-17 4.11"
## [1] "c5-24 0.00"
## [1] "c5-47 0.00"
## [1] "c5-54 0.00"
## [1] "c6-60 0.00"
## [1] "c6-61 0.00"
## [1] "c6-87a 4.11"
## [1] "c6-117 0.00"
## [1] "c7-06 4.11"
## [1] "c7-57 0.00"
## [1] "c7-17 0.00"
## [1] "c7-19 0.00"
## [1] "c7-30 0.00"
## [1] "c7-58 4.11"
## [1] "c7-59 4.11"
## [1] "c8-47 0.00"
## [1] "c8-59 0.00"
## [1] "c8-62 4.11"
#item-by-item
#control group 2
itemlist.fc<-unique(subset(d.fc.ready.model,GroupNew=="control 2")$ItemNum)
#run models for each item and print z values
#controllist.fc <- c()
for (i in itemlist.fc){
m.fc<-summary(glm(answerC~1,data=subset(d.fc.ready.model,ItemNum==i),family = binomial(link = "logit")))$coef[, "z value"]
print(paste(i," ",formatC(m.fc,digits=2,format="f")))
}## [1] "c1-28 5.29"
## [1] "c3-f9 5.24"
## [1] "c4-32 3.15"
## [1] "c4-39 5.03"
## [1] "c5-18 4.75"
## [1] "c5-43 4.01"
## [1] "c5-f9 5.30"
## [1] "c8-40a 1.74"
## [1] "c8-93a 5.30"
#get non significent or reversed pairs
nonsigitem.fc.c2 <- c()
for (i in itemlist.fc){
m.fc<-summary(glm(answerC~1,data=subset(d.fc.ready.model,ItemNum==i),family = binomial(link = "logit")))$coef[, "z value"]
if (m.fc < 2){
print(paste(i," ",formatC(m.fc,digits=2,format="f")))
nonsigitem.fc.c2[i]<-i}
}## [1] "c8-40a 1.74"
#compare the choice of the good sentences in control and test group
contrasts(d.fc.ready.model$Group)<- contr.sum(2)
#control 1 test -1
m.controlvstest<-glmer(answerC~Group+(1+Group|SubID)+(1+Group|ItemNum),data=d.fc.ready.model,
family = binomial(link = "logit"),
control=glmerControl(optimizer="bobyqa"))## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : unable to evaluate scaled gradient
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge: degenerate Hessian with 1
## negative eigenvalues
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.922877 0.5875635 6.676517 2.446878e-11
## Group1 2.800748 0.5848996 4.788425 1.680957e-06
#People chose more good sentences (or less bad sentences ) for the control group than the test group. #item-by-item
#test group
itemlist.fc<-unique(subset(d.fc.ready.model,GroupNew=="test")$ItemNum)
#run models for each item and print z values
#controllist.fc <- c()
for (i in itemlist.fc){
m.fc<-summary(glm(answerC~1,data=subset(d.fc.ready.model,ItemNum==i),family = binomial(link = "logit")))$coef[, "z value"]
print(paste(i," ",formatC(m.fc,digits=2,format="f")))
}## [1] "c1-50a -5.29"
## [1] "c2-34 4.58"
## [1] "c3-54 3.37"
## [1] "c4-27 5.30"
## [1] "c4-56b 5.24"
## [1] "c6-24 5.30"
## [1] "c6-80a 5.29"
## [1] "c6-83 -4.40"
## [1] "c6-98 5.24"
## [1] "c6-99 2.22"
## [1] "c7-102 4.40"
## [1] "c7-28 -4.21"
## [1] "c8-08 -3.59"
## [1] "c8-60 1.49"
## [1] "c8-71b 4.21"
## [1] "c8-77 3.81"
## [1] "c8-93c 3.81"
#get non significent or reversed pairs
nonsigitem.fc <- c()
for (i in itemlist.fc){
m.fc<-summary(glm(answerC~1,data=subset(d.fc.ready.model,ItemNum==i),family = binomial(link = "logit")))$coef[, "z value"]
if (m.fc < 2){
print(paste(i," ",formatC(m.fc,digits=2,format="f")))
nonsigitem.fc[i]<-i}
}## [1] "c1-50a -5.29"
## [1] "c6-83 -4.40"
## [1] "c7-28 -4.21"
## [1] "c8-08 -3.59"
## [1] "c8-60 1.49"
#add highlight n.s. contrasts to
nonsigitemfc<-unname(nonsigitem.fc)
d.fc.ready %>%
mutate(answer=ifelse(choice==GoodChoice,"Good","Bad"))%>%
group_by(GroupNew,ItemNum,answer) %>%
tally()%>%
mutate(Proportion = n / sum(n),se = sqrt(Proportion*(1-Proportion)/n))%>%
filter(GroupNew=="test")->d.fc.testgroup
ggplot(data=d.fc.testgroup,aes(x=answer,y=Proportion))+
stat_summary(geom = "bar", fun.y = "mean", aes(fill=answer),color="black",position = position_dodge())+
geom_hline(yintercept = 0.5,colour="blue",linetype = 2)+
scale_y_continuous(labels = scales::percent_format(accuracy = 1))+
facet_wrap(~ItemNum,ncol = 6)+
geom_rect(data = subset(d.fc.testgroup,ItemNum %in% nonsigitemfc),
xmin = -Inf,xmax = Inf,ymin = -Inf,ymax = Inf,
colour="red",size=1.5,alpha = 0)+
labs(y="Proportion of Choice",title="Test group: Individual-item")+
scale_fill_discrete(name="Answer")+
theme_bw(base_size = 12)+
theme(legend.position = c(0.95, 0.09))->p.fc.test.hl
p.fc.test.hld.fc.ready %>%
mutate(answer=ifelse(choice==GoodChoice,"Good","Bad"))%>%
group_by(GroupNew,ItemNum,answer) %>%
tally()%>%
mutate(Proportion = n / sum(n),se = sqrt(Proportion*(1-Proportion)/n))%>%
filter(GroupNew=="control 2")->d.fc.control2
ggplot(data=d.fc.control2,aes(x=answer,y=Proportion))+
stat_summary(geom = "bar", fun.y = "mean", aes(fill=answer),color="black",position = position_dodge())+
geom_hline(yintercept = 0.5,colour="blue",linetype = 2)+
scale_y_continuous(labels = scales::percent_format(accuracy = 1))+
facet_wrap(~ItemNum,ncol = 5)+
geom_rect(data = subset(d.fc.control2,ItemNum %in% nonsigitem.fc.c2),
xmin = -Inf,xmax = Inf,ymin = -Inf,ymax = Inf,
colour="red",size=1.5,alpha = 0)+
labs(y="Proportion of Choice",title="Control 2 group: Individual-item")+
scale_fill_discrete(name="Answer")+
theme_bw(base_size = 12)+
theme(legend.position = c(0.95, 0.09))->p.fc.c2.hl
p.fc.c2.hl## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] emmeans_1.4.2 scales_1.0.0 lmerTest_3.1-0 lme4_1.1-17
## [5] Matrix_1.2-18 forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3
## [9] purrr_0.3.3 readr_1.3.1 tidyr_1.0.0 tibble_2.1.3
## [13] ggplot2_3.3.0 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-144 fs_1.3.1 lubridate_1.7.4
## [4] RColorBrewer_1.1-2 httr_1.4.1 numDeriv_2016.8-1.1
## [7] tools_3.6.3 backports_1.1.5 utf8_1.1.4
## [10] R6_2.4.0 rpart_4.1-15 mgcv_1.8-31
## [13] Hmisc_4.2-0 DBI_1.0.0 colorspace_1.4-1
## [16] nnet_7.3-12 withr_2.1.2 tidyselect_0.2.5
## [19] gridExtra_2.3 compiler_3.6.3 cli_1.1.0
## [22] rvest_0.3.5 htmlTable_1.13.2 xml2_1.2.2
## [25] sandwich_2.5-1 labeling_0.3 checkmate_1.9.4
## [28] mvtnorm_1.0-11 digest_0.6.22 foreign_0.8-75
## [31] minqa_1.2.4 rmarkdown_2.1 base64enc_0.1-3
## [34] pkgconfig_2.0.3 htmltools_0.4.0 dbplyr_1.4.2
## [37] htmlwidgets_1.5.1 rlang_0.4.1 readxl_1.3.1
## [40] rstudioapi_0.10 generics_0.0.2 zoo_1.8-6
## [43] jsonlite_1.6 acepack_1.4.1 magrittr_1.5
## [46] Formula_1.2-3 Rcpp_1.0.2 munsell_0.5.0
## [49] fansi_0.4.0 lifecycle_0.1.0 stringi_1.4.3
## [52] multcomp_1.4-10 yaml_2.2.0 MASS_7.3-51.5
## [55] grid_3.6.3 crayon_1.3.4 lattice_0.20-38
## [58] haven_2.2.0 splines_3.6.3 hms_0.5.2
## [61] zeallot_0.1.0 knitr_1.25 pillar_1.4.2
## [64] estimability_1.3 codetools_0.2-16 reprex_0.3.0
## [67] glue_1.3.1 evaluate_0.14 latticeExtra_0.6-28
## [70] data.table_1.12.6 modelr_0.1.5 vctrs_0.2.0
## [73] nloptr_1.2.1 cellranger_1.1.0 gtable_0.3.0
## [76] assertthat_0.2.1 xfun_0.10 xtable_1.8-4
## [79] broom_0.5.2 coda_0.19-3 survival_3.1-8
## [82] cluster_2.1.0 TH.data_1.0-10 ellipsis_0.3.0